数据读取

分别将训练和测试数据读取出来

# Clear the current workspace.
# NOTE(review): rm(list = ls()) in a script is discouraged — it wipes the
# caller's session objects but not loaded packages or options.
rm(list=ls())   

#使用路径方法,解决写死路径的问题
library(plyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

#Read the data.
#NOTE(review): the variable names are misleading — `data` holds
#aug_train.csv (used for fitting below) while `data_train` holds
#aug_test.csv (the hold-out set). Confirm before any renaming.
data=read.csv("aug_train.csv",header=T)  #read the training CSV and assign it to data
data_train=read.csv("aug_test.csv",header=T)  #read the test CSV (despite the name)

#Data cleaning: drop enrollee id and city id, which carry no analytic value
data=data[,-(1:2)]  #drop columns 1 through 2 (negative index removes; [rows, columns])
data_train=data_train[,-(1:2)]

names(data)=c("city_dev","gender","expenience","university","edu_level","major","expenience_year","company_size","company_type","jobs","train_hours","target") #rename columns
names(data_train)=c("city_dev","gender","expenience","university","edu_level","major","expenience_year","company_size","company_type","jobs","train_hours","target") #rename columns

# getmode: return the most frequent value (statistical mode) of a vector.
# Ties are broken in favor of the value that occurs first in `v`.
getmode <- function(v) {
  seen <- unique(v)                    # candidate values, in first-seen order
  counts <- tabulate(match(v, seen))   # occurrence count per candidate
  seen[which.max(counts)]              # first candidate with the highest count
}


# Summarize `target` within each level of a grouping column.
#
# Args:
#   var: name (string) of the grouping column.
#   df:  data frame containing `var` and a numeric `target` column.
#        Defaults to the global `data` for backward compatibility with
#        existing descrb("colname") calls.
# Returns:
#   A matrix with one row per group level and columns N (count), MU (mean),
#   SD, MIN, MED (median), MAX of `target`.
descrb = function(var, df = data){
  Z <- df[[var]]
  # Apply one summary function to target within each level of Z.
  by_group <- function(f) tapply(df$target, Z, f)
  cbind(
    N   = by_group(length),
    MU  = by_group(mean),
    SD  = by_group(sd),
    MIN = by_group(min),
    MED = by_group(median),
    MAX = by_group(max)
  )
}

数据预处理

填充空白值

对于数值类型变量,使用均值进行填充;对于分类变量,使用众数进行填充。 这里我们得到了训练数据员工离职率的均值为0.249,后面进行预测的时候会以此阈值进行计算。

# Fill values for imputation: numeric columns use the mean, categorical
# columns use the mode.
# NOTE(review): mean() is called without na.rm = TRUE, so this assumes the
# numeric columns contain no NA (blanks come in as "") — confirm on the raw CSV.
city_dev_avg=mean(data[,1])  #mean of city_dev
gender_med=getmode(data[,2])  #mode of gender
expenience_med=getmode(data[,3])  #mode of expenience
university_med=getmode(data[,4])  #mode of university
edu_level_med=getmode(data[,5])  #mode of edu_level
major_med=getmode(data[,6])  #mode of major
expenience_year_med=getmode(data[,7])  #mode of expenience_year
# company_size_med=getmode(data[,8])  # mode of company_size; the empty string is the most frequent value, so a synthetic "Other" level is used instead
company_size_med="Other" # synthetic level standing in for missing company size
company_type_med=getmode(data[,9])  #mode of company_type
jobs_med=getmode(data[,10])  #mode of jobs
train_hours_med=mean(data[,11])  #mean of train_hours

summary(data$target) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   0.249   0.000   1.000
# Mean attrition rate is 0.249; this threshold is reused later when turning predicted probabilities into classes
#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#  0.000   0.000   0.000   0.249   0.000   1.000 

# Impute blank strings in the training data with the fill values above
data[which(data$gender==""),"gender"] = gender_med # fill blanks with the mode
data[which(data$expenience==""),"expenience"] = expenience_med # fill blanks with the mode
data[which(data$university==""),"university"] = university_med # fill blanks with the mode
data[which(data$edu_level==""),"edu_level"] = edu_level_med # fill blanks with the mode
data[which(data$major==""),"major"] = major_med # fill blanks with the mode
data[which(data$expenience_year==""),"expenience_year"] = expenience_year_med # fill blanks with the mode
data[which(data$company_size==""),"company_size"] = company_size_med # fill blanks with the synthetic "Other"
data[which(data$company_type==""),"company_type"] = company_type_med # fill blanks with the mode
data[which(data$jobs==""),"jobs"] = jobs_med # fill blanks with the mode
data$train_hours_scale=log(data$train_hours) # log-transform train_hours to bring it onto a scale comparable with city_dev

# Same imputation for the hold-out set (the original comments here wrongly
# said "gender" for every column — they are column-by-column fills)
data_train[which(data_train$gender==""),"gender"] = gender_med # fill blanks with the mode
data_train[which(data_train$expenience==""),"expenience"] = expenience_med # fill blanks with the mode
data_train[which(data_train$university==""),"university"] = university_med # fill blanks with the mode
data_train[which(data_train$edu_level==""),"edu_level"] = edu_level_med # fill blanks with the mode
data_train[which(data_train$major==""),"major"] = major_med # fill blanks with the mode
data_train[which(data_train$expenience_year==""),"expenience_year"] = expenience_year_med # fill blanks with the mode
data_train[which(data_train$company_size==""),"company_size"] = company_size_med # fill blanks with the synthetic "Other"
data_train[which(data_train$company_type==""),"company_type"] = company_type_med # fill blanks with the mode
data_train[which(data_train$jobs==""),"jobs"] = jobs_med # fill blanks with the mode
data_train$train_hours_scale=log(data_train$train_hours) # log-transform, matching the training data

# Inspect the distinct levels of each categorical column after imputation
unique(data$gender) # "Male"   "Female" "Other" 
## [1] "Male"   "Female" "Other"
unique(data$expenience) #  "Has relevent experience" "No relevent experience" 
## [1] "Has relevent experience" "No relevent experience"
unique(data$university) # "no_enrollment"    "Full time course" "Part time course"
## [1] "no_enrollment"    "Full time course" "Part time course"
unique(data$edu_level) # "Graduate"       "Masters"        "High School"     "Phd"            "Primary School"
## [1] "Graduate"       "Masters"        "High School"    "Phd"           
## [5] "Primary School"
unique(data$major) # "STEM"            "Business Degree" "Arts"            "Humanities"      "No Major"        "Other"      
## [1] "STEM"            "Other"           "No Major"        "Business Degree"
## [5] "Arts"            "Humanities"
sort(unique(data$expenience_year)) # "<1"  ">20" "1"   "10"  "11"  "12"  "13"  "14"  "15"  "16"  "17"  "18"  "19"  "2"   "20"  "3"   "4"   "5"   "6"   "7"   "8"   "9"  
##  [1] "<1"  ">20" "1"   "10"  "11"  "12"  "13"  "14"  "15"  "16"  "17"  "18" 
## [13] "19"  "2"   "20"  "3"   "4"   "5"   "6"   "7"   "8"   "9"
unique(data$company_size) # "Others"    "50-99"     "<10"       "10000+"    "5000-9999" "1000-4999" "10/49"     "100-500"   "500-999" 
## [1] "500-999"   "100-500"   "50-99"     "1000-4999" "10000+"    "Other"    
## [7] "10/49"     "5000-9999" "<10"
unique(data$company_type) # "Pvt Ltd"             "Funded Startup"      "Early Stage Startup" "Other"               "Public Sector"       "NGO"              
## [1] "Pvt Ltd"             "Funded Startup"      "Public Sector"      
## [4] "NGO"                 "Early Stage Startup" "Other"
unique(data$jobs) # "1"     ">4"    "never" "4"     "3"     "2"    
## [1] ">4"    "2"     "1"     "3"     "4"     "never"

数据合并

对于数据量较小的样本,直接合并到Other里面去。

# Gender: far more male than female respondents
barplot(table(data$gender), xlab="gender", col=rainbow(3, alpha = 0.4))

# Share of each gender in the full sample
gender_data <- data %>% 
  group_by(gender) %>% 
  count() %>% 
  ungroup() %>% 
  mutate(per=`n`/sum(`n`)) 
gender_data$label <- scales::percent(gender_data$per)
# FIX: label was misspelled "Fmale". The grouped rows sort as
# Female, Male, Other, so the labels must be in that order.
label_gender = c("Female", "Male", "Other")
ggplot(data=gender_data)+
  geom_bar(aes(x="", y=per, fill=label_gender), stat="identity", width = 1)+
  coord_polar("y", start=0)+
  theme_void()+
  geom_text(aes(x=1, y = cumsum(per) - per/2, label=label))

# Attrition rate (target) split by gender
ggplot(data, aes(factor(target), fill = factor(gender))) + geom_bar(position = "fill")

# Merge the small Female group into Other (in both data sets)
data[which(data$gender=="Female"),"gender"] = "Other" 
data_train[which(data_train$gender=="Female"),"gender"] = "Other"  

# Re-inspect the distribution after merging
barplot(table(data$gender),xlab="gender", col=rainbow(3, alpha = 0.4))

gender_data <- data %>% 
  group_by(gender) %>% 
  count() %>% 
  ungroup() %>% 
  mutate(per=`n`/sum(`n`)) 
gender_data$label <- scales::percent(gender_data$per)
label_gender = c("Male", "Other")
ggplot(data=gender_data)+
  geom_bar(aes(x="", y=per, fill=label_gender), stat="identity", width = 1)+
  coord_polar("y", start=0)+
  theme_void()+
  geom_text(aes(x=1, y = cumsum(per) - per/2, label=label))

ggplot(data, aes(factor(target), fill = factor(gender))) + geom_bar(position = "fill")

# Relevant experience: most respondents have relevant work experience
barplot(table(data$expenience),xlab="expenience", col=rainbow(2, alpha = 0.4))

# Share of each experience level in the full sample
expenience_data <- data %>% 
  group_by(expenience) %>% 
  count() %>% 
  ungroup() %>% 
  mutate(per=`n`/sum(`n`)) 
expenience_data$label <- scales::percent(expenience_data$per)
label_expenience = c("Has relevent experience", "No relevent experience")
ggplot(data=expenience_data)+
  geom_bar(aes(x="", y=per, fill=label_expenience), stat="identity", width = 1)+
  coord_polar("y", start=0)+
  theme_void()+
  geom_text(aes(x=1, y = cumsum(per) - per/2, label=label))

# Attrition rate (target) split by relevant experience
ggplot(data, aes(factor(target), fill = factor(expenience))) + geom_bar(position = "fill")

# University enrollment: most respondents are not currently enrolled
barplot(table(data$university),xlab="university", col=rainbow(3, alpha = 0.4))

# Share of each enrollment status in the full sample
university_data <- data %>% 
  group_by(university) %>% 
  count() %>% 
  ungroup() %>% 
  mutate(per=`n`/sum(`n`)) 
university_data$label <- scales::percent(university_data$per)
label_university = c("Full time course", "No enrollment",  "Part time course")
ggplot(data=university_data)+
  geom_bar(aes(x="", y=per, fill=label_university), stat="identity", width = 1)+
  coord_polar("y", start=0)+
  theme_void()+
  geom_text(aes(x=1, y = cumsum(per) - per/2, label=label))

# Attrition rate (target) split by enrollment status
ggplot(data, aes(factor(target), fill = factor(university))) + geom_bar(position = "fill")

# Merge "Full time course" and "Part time course" into a single
# "enrollment" level, in BOTH data sets.
data[which(data$university=="Full time course" | data$university=="Part time course"),"university"] = "enrollment"
# BUG FIX: the hold-out set previously assigned `university_med`
# ("no_enrollment") here, recoding enrolled students as NOT enrolled and
# contradicting the training-set recoding on the line above.
data_train[which(data_train$university=="Full time course" | data_train$university=="Part time course"),"university"] = "enrollment"

# University enrollment after merging full/part time into "enrollment"
barplot(table(data$university),xlab="university", col=rainbow(3, alpha = 0.4))

# Share of each enrollment status after the merge
university_data <- data %>% 
  group_by(university) %>% 
  count() %>% 
  ungroup() %>% 
  mutate(per=`n`/sum(`n`)) 
university_data$label <- scales::percent(university_data$per)
label_university = c("Enrollment", "No enrollment")
ggplot(data=university_data)+
  geom_bar(aes(x="", y=per, fill=label_university), stat="identity", width = 1)+
  coord_polar("y", start=0)+
  theme_void()+
  geom_text(aes(x=1, y = cumsum(per) - per/2, label=label))

# Attrition rate (target) split by enrollment status
ggplot(data, aes(factor(target), fill = factor(university))) + geom_bar(position = "fill")

# Education level
barplot(table(data$edu_level),xlab="edu_level", col=rainbow(5, alpha = 0.4))

# Share of each education level in the full sample
edu_level_data <- data %>% 
  group_by(edu_level) %>% 
  count() %>% 
  ungroup() %>% 
  mutate(per=`n`/sum(`n`)) 
edu_level_data$label <- scales::percent(edu_level_data$per)
# FIX: labels were misspelled ("Gradule", "Primay School"). The grouped
# rows sort as Graduate, High School, Masters, Phd, Primary School.
label_edu_level = c("Graduate", "High School", "Masters", "Phd", "Primary School")
ggplot(data=edu_level_data)+
  geom_bar(aes(x="", y=per, fill=label_edu_level), stat="identity", width = 1)+
  coord_polar("y", start=0)+
  theme_void()+
  geom_text(aes(x=1, y = cumsum(per) - per/2, label=label))

# Attrition rate (target) split by education level
ggplot(data, aes(factor(target), fill = factor(edu_level))) + geom_bar(position = "fill")

# Major: overwhelmingly STEM
barplot(table(data$major),xlab="major", col=rainbow(6, alpha = 0.4))

# Share of each major in the full sample
major_data <- data %>% 
  group_by(major) %>% 
  count() %>% 
  ungroup() %>% 
  mutate(per=`n`/sum(`n`)) 
major_data$label <- scales::percent(major_data$per)
label_major = c("Arts", "Business Degree", "Humanities", "No Major", "Other", "STEM" )
ggplot(data=major_data)+
  geom_bar(aes(x="", y=per, fill=label_major), stat="identity", width = 1)+
  coord_polar("y", start=0)+
  theme_void()+
  geom_text(aes(x=1, y = cumsum(per) - per/2, label=label))

# Attrition rate (target) split by major
ggplot(data, aes(factor(target), fill = factor(major))) + geom_bar(position = "fill")

# Merge all low-count majors into Other (in both data sets)
data[which(data$major=="Business Degree" | data$major=="Arts" | data$major=="Humanities" | data$major=="No Major"),"major"] = "Other"
data_train[which(data_train$major=="Business Degree" | data_train$major=="Arts" | data_train$major=="Humanities" | data_train$major=="No Major"),"major"] = "Other"
# Re-inspect after merging
barplot(table(data$major),xlab="major", col=rainbow(6, alpha = 0.4))

# Share of each major after the merge
major_data <- data %>% 
  group_by(major) %>% 
  count() %>% 
  ungroup() %>% 
  mutate(per=`n`/sum(`n`)) 
major_data$label <- scales::percent(major_data$per)
label_major = c("Other", "STEM" )
ggplot(data=major_data)+
  geom_bar(aes(x="", y=per, fill=label_major), stat="identity", width = 1)+
  coord_polar("y", start=0)+
  theme_void()+
  geom_text(aes(x=1, y = cumsum(per) - per/2, label=label))

# Attrition rate (target) split by major
ggplot(data, aes(factor(target), fill = factor(major))) + geom_bar(position = "fill")

# Years of experience: distribution across the 22 recorded levels
# (the original comment here was a copy-paste error about STEM majors)
barplot(table(data$expenience_year),xlab="expenience year", col=rainbow(21, alpha = 0.4))

# Recode the open-ended categories so the column can be converted to numeric
data[which(data$expenience_year=="<1"),"expenience_year"] = 0 # "<1" -> 0, treated as no experience
data[which(data$expenience_year==">20"),"expenience_year"] = 21 # ">20" -> synthetic value 21

# Convert to numeric and inspect the sorted distribution
data$expenience_year=as.numeric(data$expenience_year)
barplot(table(sort(data$expenience_year)),xlab="expenience year", col=rainbow(21, alpha = 0.4))

ggplot(data, aes(factor(target), fill = factor(expenience_year))) + geom_bar(position = "fill")

# Discretize experience into three bands: 1 = under 1 year, 2 = 1-4 years,
# 3 = over 4 years. (The original comment claimed four bands
# 1-5/5-10/10-20/20+, which does not match the code.)
data$expenience_year_new = 1 * (data$expenience_year<1) + 2 * (data$expenience_year>=1 & data$expenience_year<=4) + 3 * (data$expenience_year >4)
barplot(table(sort(data$expenience_year_new)),xlab="expenience year", col=rainbow(3, alpha = 0.4))

expenience_year_new_data <- data %>% 
  group_by(expenience_year_new) %>% 
  count() %>% 
  ungroup() %>% 
  mutate(per=`n`/sum(`n`)) 
expenience_year_new_data$label <- scales::percent(expenience_year_new_data$per)

# FIX: the grouped rows sort as band 1 (<1 yr), 2 (1-4 yrs), 3 (>4 yrs),
# so the labels must be in that order; the previous
# c("1-4", "4+", "Never") attached the wrong label to every slice.
label_expenience_year_new = c( "Never", "1-4", "4+" )
ggplot(data=expenience_year_new_data)+
  geom_bar(aes(x="", y=per, fill=label_expenience_year_new), stat="identity", width = 1)+
  coord_polar("y", start=0)+
  theme_void()+
  geom_text(aes(x=1, y = cumsum(per) - per/2, label=label))

# Attrition rate (target) split by experience band
ggplot(data, aes(factor(target), fill = factor(expenience_year_new))) + geom_bar(position = "fill")

# Apply the same experience recoding to the hold-out set
data_train[which(data_train$expenience_year=="<1"),"expenience_year"] = 0
data_train[which(data_train$expenience_year==">20"),"expenience_year"] = 21
# Convert to numeric
data_train$expenience_year=as.numeric(data_train$expenience_year)
# Discretize into the same three bands: 1 = <1 year, 2 = 1-4 years, 3 = >4 years
data_train$expenience_year_new = 1 * (data_train$expenience_year<1) + 2 * (data_train$expenience_year>=1 & data_train$expenience_year<=4) + 3 * (data_train$expenience_year >4)
# Company size
barplot(table(data$company_size),xlab="company size", col=rainbow(9, alpha = 0.4))

# Share of each company size in the full sample
company_size_data <- data %>% 
  group_by(company_size) %>% 
  count() %>% 
  ungroup() %>% 
  mutate(per=`n`/sum(`n`)) 
company_size_data$label <- scales::percent(company_size_data$per)
label_company_size = c("<10",  "10/49", "100-500", "1000-4999", "10000+", "50-99", "500-999",  "5000-9999", "Other")
ggplot(data=company_size_data)+
  geom_bar(aes(x="", y=per, fill=label_company_size), stat="identity", width = 1)+
  coord_polar("y", start=0)+
  theme_void()+
  geom_text(aes(x=1, y = cumsum(per) - per/2, label=label))

# Attrition rate (target) split by company size
ggplot(data, aes(factor(target), fill = factor(company_size))) + geom_bar(position = "fill")

# Company type
barplot(table(data$company_type),xlab="company type", col=rainbow(6, alpha = 0.4))

# Share of each company type in the full sample
company_type_data <- data %>% 
  group_by(company_type) %>% 
  count() %>% 
  ungroup() %>% 
  mutate(per=`n`/sum(`n`)) 
company_type_data$label <- scales::percent(company_type_data$per)
# FIX: this label vector was previously (mis)named label_company_size,
# silently clobbering the company-size labels defined above; renamed to
# match label_company_type used later in the file.
label_company_type = c("Early Stage Startup",  "Funded Startup", "NGO", "Other", "Public Sector", "Pvt Ltd")
ggplot(data=company_type_data)+
  geom_bar(aes(x="", y=per, fill=label_company_type), stat="identity", width = 1)+
  coord_polar("y", start=0)+
  theme_void()+
  geom_text(aes(x=1, y = cumsum(per) - per/2, label=label))

# Attrition rate (target) split by company type
ggplot(data, aes(factor(target), fill = factor(company_type))) + geom_bar(position = "fill")

# Merge low-count company types into Other (in both data sets)
data[which(data$company_type=="Funded Startup" | data$company_type=="Early Stage Startup" | data$company_type=="Public Sector" | data$company_type=="NGO"),"company_type"] = "Other"
data_train[which(data_train$company_type=="Funded Startup" | data_train$company_type=="Early Stage Startup" | data_train$company_type=="Public Sector" | data_train$company_type=="NGO"),"company_type"] = "Other"
# Re-inspect after merging
barplot(table(data$company_type),xlab="company type", col=rainbow(6, alpha = 0.4))

company_type_data <- data %>% 
  group_by(company_type) %>% 
  count() %>% 
  ungroup() %>% 
  mutate(per=`n`/sum(`n`)) 
company_type_data$label <- scales::percent(company_type_data$per)
label_company_type = c("Other", "Pvt Ltd")
ggplot(data=company_type_data)+
  geom_bar(aes(x="", y=per, fill=label_company_type), stat="identity", width = 1)+
  coord_polar("y", start=0)+
  theme_void()+
  geom_text(aes(x=1, y = cumsum(per) - per/2, label=label))

ggplot(data, aes(factor(target), fill = factor(company_type))) + geom_bar(position = "fill")

# Number of previous job changes
barplot(table(data$jobs),xlab="jobs", col=rainbow(6, alpha = 0.4))

jobs_data <- data %>% 
  group_by(jobs) %>% 
  count() %>% 
  ungroup() %>% 
  mutate(per=`n`/sum(`n`)) 
jobs_data$label <- scales::percent(jobs_data$per)
label_jobs = c(">4",  "1", "2", "3", "4", "Never")
ggplot(data=jobs_data)+
  geom_bar(aes(x="", y=per, fill=label_jobs), stat="identity", width = 1)+
  coord_polar("y", start=0)+
  theme_void()+
  geom_text(aes(x=1, y = cumsum(per) - per/2, label=label))

# Attrition rate (target) split by number of job changes
ggplot(data, aes(factor(target), fill = factor(jobs))) + geom_bar(position = "fill")

# Distribution of the numeric train_hours column
barplot(table(data$train_hours),xlab="train hours", col=rainbow(100, alpha = 0.4))

# Before taking logs there are many outliers
boxplot(train_hours~target, data, ylab="train hours",xlab="target", col=rainbow(2, alpha = 0.4), outline=TRUE)

# After taking logs the distribution is much better behaved
boxplot(log(train_hours)~target, data, ylab="train hours",xlab="target", col=rainbow(2, alpha = 0.4), outline=TRUE)

导出处理后的数据

# Export the cleaned / recoded training data for later reuse
write.csv(data, "aug_train_after.csv", row.names = FALSE)

预处理后的数据情况

descrb("gender") # target summary by gender
##           N        MU        SD MIN MED MAX
## Male  15890 0.2481435 0.4319491   0   0   1
## Other  1268 0.2594637 0.4385133   0   0   1
descrb("expenience") # target summary by relevant experience
##                             N        MU        SD MIN MED MAX
## Has relevent experience 12354 0.2133722 0.4097049   0   0   1
## No relevent experience   4804 0.3405495 0.4739434   0   0   1
descrb("university") # target summary by enrollment status
##                   N        MU        SD MIN MED MAX
## enrollment     4457 0.3479919 0.4763869   0   0   1
## no_enrollment 12701 0.2142351 0.4103068   0   0   1
descrb("edu_level") # target summary by education level
##                    N        MU        SD MIN MED MAX
## Graduate       10776 0.2769117 0.4474932   0   0   1
## High School     1805 0.1961219 0.3971719   0   0   1
## Masters         3934 0.2155567 0.4112603   0   0   1
## Phd              368 0.1385870 0.3459854   0   0   1
## Primary School   275 0.1272727 0.3338859   0   0   1
descrb("major") # target summary by major
##           N        MU        SD MIN MED MAX
## Other  1663 0.2423331 0.4286237   0   0   1
## STEM  15495 0.2496934 0.4328495   0   0   1
descrb("expenience_year_new") # target summary by experience band
##       N        MU        SD MIN MED MAX
## 1   478 0.4518828 0.4982008   0   0   1
## 2  3939 0.3445037 0.4752665   0   0   1
## 3 12741 0.2118358 0.4086251   0   0   1
descrb("company_size") # target summary by company size
##              N        MU        SD MIN MED MAX
## <10       1180 0.1703390 0.3760897   0   0   1
## 10/49     1316 0.2355623 0.4245111   0   0   1
## 100-500   2293 0.1604884 0.3671385   0   0   1
## 1000-4999 1201 0.1507077 0.3579128   0   0   1
## 10000+    1796 0.1932071 0.3949240   0   0   1
## 50-99     2767 0.1752801 0.3802752   0   0   1
## 500-999    789 0.1711027 0.3768375   0   0   1
## 5000-9999  511 0.1819961 0.3862194   0   0   1
## Other     5305 0.4056550 0.4910646   0   0   1
descrb("company_type") # target summary by company type
##             N        MU        SD MIN MED MAX
## Other    2845 0.1926186 0.3944254   0   0   1
## Pvt Ltd 14313 0.2601831 0.4387497   0   0   1
descrb("jobs") # target summary by number of job changes
##          N        MU        SD MIN MED MAX
## >4    2961 0.1789936 0.3834117   0   0   1
## 1     7588 0.2688455 0.4433887   0   0   1
## 2     2591 0.2392898 0.4267324   0   0   1
## 3      921 0.2280130 0.4197790   0   0   1
## 4      925 0.2259459 0.4184300   0   0   1
## never 2172 0.3052486 0.4606187   0   0   1
descrb("train_hours") # target summary by training hours (one row per distinct value)
##       N         MU        SD MIN MED MAX
## 1     9 0.22222222 0.4409586   0 0.0   1
## 2    89 0.23595506 0.4269999   0 0.0   1
## 3   102 0.26470588 0.4433551   0 0.0   1
## 4   194 0.17010309 0.3766955   0 0.0   1
## 5    97 0.23711340 0.4275218   0 0.0   1
## 6   235 0.25957447 0.4393369   0 0.0   1
## 7   185 0.29189189 0.4558666   0 0.0   1
## 8   191 0.23036649 0.4221740   0 0.0   1
## 9   208 0.30769231 0.4626519   0 0.0   1
## 10  228 0.27631579 0.4481588   0 0.0   1
## 11  209 0.20574163 0.4052131   0 0.0   1
## 12  265 0.20754717 0.4063180   0 0.0   1
## 13  194 0.31443299 0.4654909   0 0.0   1
## 14  192 0.24479167 0.4310877   0 0.0   1
## 15  204 0.25000000 0.4340779   0 0.0   1
## 16  168 0.26785714 0.4441666   0 0.0   1
## 17  243 0.25925926 0.4391326   0 0.0   1
## 18  264 0.25000000 0.4338351   0 0.0   1
## 19  143 0.27972028 0.4504394   0 0.0   1
## 20  246 0.23170732 0.4227832   0 0.0   1
## 21  230 0.27391304 0.4469376   0 0.0   1
## 22  250 0.22800000 0.4203842   0 0.0   1
## 23  237 0.23628692 0.4256995   0 0.0   1
## 24  238 0.25630252 0.4375109   0 0.0   1
## 25  184 0.27717391 0.4488244   0 0.0   1
## 26  229 0.27510917 0.4475475   0 0.0   1
## 27  102 0.26470588 0.4433551   0 0.0   1
## 28  293 0.22866894 0.4206941   0 0.0   1
## 29  160 0.26250000 0.4413744   0 0.0   1
## 30  168 0.25595238 0.4376998   0 0.0   1
## 31  165 0.20606061 0.4057058   0 0.0   1
## 32  182 0.30769231 0.4628117   0 0.0   1
## 33  135 0.23703704 0.4268490   0 0.0   1
## 34  235 0.26382979 0.4416488   0 0.0   1
## 35  142 0.23239437 0.4238542   0 0.0   1
## 36  189 0.26455026 0.4422650   0 0.0   1
## 37  146 0.23287671 0.4241193   0 0.0   1
## 38  108 0.25000000 0.4350314   0 0.0   1
## 39  160 0.28125000 0.4510209   0 0.0   1
## 40  172 0.28488372 0.4526769   0 0.0   1
## 41  128 0.17968750 0.3854355   0 0.0   1
## 42  210 0.23809524 0.4269354   0 0.0   1
## 43  179 0.28491620 0.4526407   0 0.0   1
## 44  183 0.26775956 0.4440064   0 0.0   1
## 45  156 0.27564103 0.4482758   0 0.0   1
## 46  202 0.25742574 0.4383021   0 0.0   1
## 47  134 0.24626866 0.4324535   0 0.0   1
## 48  215 0.26511628 0.4424252   0 0.0   1
## 49   45 0.24444444 0.4346135   0 0.0   1
## 50  247 0.30364372 0.4607643   0 0.0   1
## 51  165 0.32121212 0.4683635   0 0.0   1
## 52  178 0.21348315 0.4109218   0 0.0   1
## 53  117 0.26495726 0.4432086   0 0.0   1
## 54  147 0.22448980 0.4186724   0 0.0   1
## 55  153 0.20915033 0.4080376   0 0.0   1
## 56  231 0.29004329 0.4547675   0 0.0   1
## 57  127 0.26771654 0.4445226   0 0.0   1
## 58  122 0.27049180 0.4460457   0 0.0   1
## 59   62 0.16129032 0.3708010   0 0.0   1
## 60   85 0.21176471 0.4109837   0 0.0   1
## 61   90 0.23333333 0.4253221   0 0.0   1
## 62  114 0.30701754 0.4632932   0 0.0   1
## 63   70 0.28571429 0.4550158   0 0.0   1
## 64  127 0.23622047 0.4264414   0 0.0   1
## 65   71 0.21126761 0.4111132   0 0.0   1
## 66  103 0.20388350 0.4048535   0 0.0   1
## 67   86 0.23255814 0.4249406   0 0.0   1
## 68  104 0.33653846 0.4748137   0 0.0   1
## 69   73 0.21917808 0.4165525   0 0.0   1
## 70  121 0.17355372 0.3803000   0 0.0   1
## 71   15 0.13333333 0.3518658   0 0.0   1
## 72  132 0.20454545 0.4049057   0 0.0   1
## 73   56 0.23214286 0.4260205   0 0.0   1
## 74  111 0.23423423 0.4254400   0 0.0   1
## 75   51 0.29411765 0.4601790   0 0.0   1
## 76   72 0.29166667 0.4577194   0 0.0   1
## 77   79 0.24050633 0.4301219   0 0.0   1
## 78  150 0.36000000 0.4816080   0 0.0   1
## 79   58 0.20689655 0.4086186   0 0.0   1
## 80  130 0.24615385 0.4324357   0 0.0   1
## 81   55 0.23636364 0.4287638   0 0.0   1
## 82   87 0.19540230 0.3988087   0 0.0   1
## 83   75 0.18666667 0.3922676   0 0.0   1
## 84  100 0.22000000 0.4163332   0 0.0   1
## 85   54 0.16666667 0.3761774   0 0.0   1
## 86   86 0.20930233 0.4091966   0 0.0   1
## 87   49 0.36734694 0.4870779   0 0.0   1
## 88   77 0.28571429 0.4547163   0 0.0   1
## 89   61 0.26229508 0.4435328   0 0.0   1
## 90  102 0.19607843 0.3989892   0 0.0   1
## 91   63 0.28571429 0.4553826   0 0.0   1
## 92   91 0.20879121 0.4086967   0 0.0   1
## 94  107 0.22429907 0.4190828   0 0.0   1
## 95   34 0.32352941 0.4748581   0 0.0   1
## 96  112 0.27678571 0.4494205   0 0.0   1
## 97   38 0.15789474 0.3695370   0 0.0   1
## 98   68 0.19117647 0.3961514   0 0.0   1
## 99   42 0.21428571 0.4152997   0 0.0   1
## 100 100 0.22000000 0.4163332   0 0.0   1
## 101  34 0.29411765 0.4624973   0 0.0   1
## 102 126 0.32539683 0.4703933   0 0.0   1
## 103  29 0.27586207 0.4548588   0 0.0   1
## 104  59 0.27118644 0.4483882   0 0.0   1
## 105  59 0.11864407 0.3261450   0 0.0   1
## 106  87 0.19540230 0.3988087   0 0.0   1
## 107  53 0.33962264 0.4781131   0 0.0   1
## 108  83 0.20481928 0.4060228   0 0.0   1
## 109  53 0.30188679 0.4634696   0 0.0   1
## 110  67 0.23880597 0.4295717   0 0.0   1
## 111  55 0.34545455 0.4798990   0 0.0   1
## 112  88 0.29545455 0.4588614   0 0.0   1
## 113  40 0.20000000 0.4050957   0 0.0   1
## 114  61 0.31147541 0.4669398   0 0.0   1
## 116  56 0.35714286 0.4834938   0 0.0   1
## 117  21 0.33333333 0.4830459   0 0.0   1
## 118  36 0.33333333 0.4780914   0 0.0   1
## 119  18 0.27777778 0.4608886   0 0.0   1
## 120  13 0.15384615 0.3755338   0 0.0   1
## 121  15 0.20000000 0.4140393   0 0.0   1
## 122  45 0.22222222 0.4204375   0 0.0   1
## 123  17 0.11764706 0.3321056   0 0.0   1
## 124  39 0.25641026 0.4423590   0 0.0   1
## 125  21 0.23809524 0.4364358   0 0.0   1
## 126  32 0.12500000 0.3360108   0 0.0   1
## 127  19 0.15789474 0.3746343   0 0.0   1
## 128  35 0.22857143 0.4260430   0 0.0   1
## 129  19 0.21052632 0.4188539   0 0.0   1
## 130  49 0.18367347 0.3912304   0 0.0   1
## 131  22 0.18181818 0.3947710   0 0.0   1
## 132  28 0.25000000 0.4409586   0 0.0   1
## 133  21 0.14285714 0.3585686   0 0.0   1
## 134  52 0.26923077 0.4478876   0 0.0   1
## 135  23 0.17391304 0.3875534   0 0.0   1
## 136  35 0.28571429 0.4583492   0 0.0   1
## 138  38 0.21052632 0.4131550   0 0.0   1
## 139  21 0.33333333 0.4830459   0 0.0   1
## 140  39 0.28205128 0.4558808   0 0.0   1
## 141  21 0.19047619 0.4023739   0 0.0   1
## 142  15 0.26666667 0.4577377   0 0.0   1
## 143  18 0.11111111 0.3233808   0 0.0   1
## 144  41 0.26829268 0.4485750   0 0.0   1
## 145  25 0.32000000 0.4760952   0 0.0   1
## 146  42 0.26190476 0.4450006   0 0.0   1
## 147  14 0.21428571 0.4258153   0 0.0   1
## 148  30 0.23333333 0.4301831   0 0.0   1
## 149  24 0.12500000 0.3378320   0 0.0   1
## 150  38 0.31578947 0.4710691   0 0.0   1
## 151  18 0.33333333 0.4850713   0 0.0   1
## 152  43 0.25581395 0.4414814   0 0.0   1
## 153  14 0.35714286 0.4972452   0 0.0   1
## 154  34 0.14705882 0.3594906   0 0.0   1
## 155  21 0.23809524 0.4364358   0 0.0   1
## 156  47 0.34042553 0.4789752   0 0.0   1
## 157  26 0.26923077 0.4523443   0 0.0   1
## 158  41 0.14634146 0.3578390   0 0.0   1
## 160  48 0.18750000 0.3944428   0 0.0   1
## 161  15 0.13333333 0.3518658   0 0.0   1
## 162  32 0.21875000 0.4200134   0 0.0   1
## 163  25 0.20000000 0.4082483   0 0.0   1
## 164  18 0.22222222 0.4277926   0 0.0   1
## 165  15 0.26666667 0.4577377   0 0.0   1
## 166  59 0.25423729 0.4391693   0 0.0   1
## 167  15 0.20000000 0.4140393   0 0.0   1
## 168  33 0.36363636 0.4885042   0 0.0   1
## 170  24 0.12500000 0.3378320   0 0.0   1
## 172  16 0.18750000 0.4031129   0 0.0   1
## 174  26 0.34615385 0.4851645   0 0.0   1
## 176  17 0.17647059 0.3929526   0 0.0   1
## 178  28 0.25000000 0.4409586   0 0.0   1
## 180  28 0.21428571 0.4178554   0 0.0   1
## 182  34 0.26470588 0.4478111   0 0.0   1
## 184  24 0.29166667 0.4643056   0 0.0   1
## 188  25 0.32000000 0.4760952   0 0.0   1
## 190  15 0.20000000 0.4140393   0 0.0   1
## 192  37 0.35135135 0.4839775   0 0.0   1
## 194  18 0.16666667 0.3834825   0 0.0   1
## 196  26 0.26923077 0.4523443   0 0.0   1
## 198  21 0.23809524 0.4364358   0 0.0   1
## 200  22 0.18181818 0.3947710   0 0.0   1
## 202  18 0.22222222 0.4277926   0 0.0   1
## 204  30 0.16666667 0.3790490   0 0.0   1
## 206  22 0.13636364 0.3512501   0 0.0   1
## 210  26 0.19230769 0.4019185   0 0.0   1
## 212  14 0.21428571 0.4258153   0 0.0   1
## 214  30 0.23333333 0.4301831   0 0.0   1
## 216  15 0.26666667 0.4577377   0 0.0   1
## 218  20 0.20000000 0.4103913   0 0.0   1
## 220  15 0.26666667 0.4577377   0 0.0   1
## 222  29 0.17241379 0.3844259   0 0.0   1
## 224  20 0.15000000 0.3663475   0 0.0   1
## 226  18 0.22222222 0.4277926   0 0.0   1
## 228   6 0.50000000 0.5477226   0 0.5   1
## 232  14 0.14285714 0.3631365   0 0.0   1
## 234   4 0.00000000 0.0000000   0 0.0   0
## 236   7 0.00000000 0.0000000   0 0.0   0
## 238   4 0.00000000 0.0000000   0 0.0   0
## 240   5 0.20000000 0.4472136   0 0.0   1
## 242  11 0.00000000 0.0000000   0 0.0   0
## 244   8 0.25000000 0.4629100   0 0.0   1
## 246  12 0.16666667 0.3892495   0 0.0   1
## 248  11 0.27272727 0.4670994   0 0.0   1
## 250  12 0.16666667 0.3892495   0 0.0   1
## 254   9 0.11111111 0.3333333   0 0.0   1
## 256  13 0.00000000 0.0000000   0 0.0   0
## 258  12 0.41666667 0.5149287   0 0.0   1
## 260   8 0.12500000 0.3535534   0 0.0   1
## 262  10 0.10000000 0.3162278   0 0.0   1
## 264  15 0.13333333 0.3518658   0 0.0   1
## 266   6 0.16666667 0.4082483   0 0.0   1
## 268  11 0.45454545 0.5222330   0 0.0   1
## 270   7 0.42857143 0.5345225   0 0.0   1
## 272   5 0.20000000 0.4472136   0 0.0   1
## 276   5 0.00000000 0.0000000   0 0.0   0
## 278  13 0.00000000 0.0000000   0 0.0   0
## 280   6 0.33333333 0.5163978   0 0.0   1
## 282   8 0.25000000 0.4629100   0 0.0   1
## 284   7 0.28571429 0.4879500   0 0.0   1
## 286   5 0.60000000 0.5477226   0 1.0   1
## 288  11 0.09090909 0.3015113   0 0.0   1
## 290   7 0.57142857 0.5345225   0 1.0   1
## 292  10 0.30000000 0.4830459   0 0.0   1
## 294   6 0.00000000 0.0000000   0 0.0   0
## 298  13 0.46153846 0.5188745   0 0.0   1
## 300  11 0.09090909 0.3015113   0 0.0   1
## 302   8 0.37500000 0.5175492   0 0.0   1
## 304  12 0.16666667 0.3892495   0 0.0   1
## 306  11 0.18181818 0.4045199   0 0.0   1
## 308  14 0.28571429 0.4688072   0 0.0   1
## 310   6 0.00000000 0.0000000   0 0.0   0
## 312  11 0.09090909 0.3015113   0 0.0   1
## 314  12 0.16666667 0.3892495   0 0.0   1
## 316  11 0.27272727 0.4670994   0 0.0   1
## 320   9 0.11111111 0.3333333   0 0.0   1
## 322  12 0.08333333 0.2886751   0 0.0   1
## 324   9 0.22222222 0.4409586   0 0.0   1
## 326  10 0.00000000 0.0000000   0 0.0   0
## 328  10 0.20000000 0.4216370   0 0.0   1
## 330  10 0.10000000 0.3162278   0 0.0   1
## 332  10 0.40000000 0.5163978   0 0.0   1
## 334  11 0.18181818 0.4045199   0 0.0   1
## 336  11 0.27272727 0.4670994   0 0.0   1

模型创建

# Regression modelling
# Null model: intercept-only logistic regression (baseline deviance)
model.empty=glm(target~1,family=binomial(link=logit), data=data)    
summary(model.empty)
## 
## Call:
## glm(formula = target ~ 1, family = binomial(link = logit), data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.7567  -0.7567  -0.7567  -0.7567   1.6676  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.10406    0.01765  -62.54   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 19259  on 17157  degrees of freedom
## Residual deviance: 19259  on 17157  degrees of freedom
## AIC: 19261
## 
## Number of Fisher Scoring iterations: 4
# Full model: all candidate predictors as main effects
model.full=glm(target~city_dev + as.factor(gender) + as.factor(expenience) + as.factor(university) + as.factor(edu_level) + as.factor(major) + as.factor(expenience_year_new) + as.factor(company_size) + as.factor(company_type) + as.factor(jobs) + train_hours,family=binomial(link=logit),data=data) 
summary(model.full)
## 
## Call:
## glm(formula = target ~ city_dev + as.factor(gender) + as.factor(expenience) + 
##     as.factor(university) + as.factor(edu_level) + as.factor(major) + 
##     as.factor(expenience_year_new) + as.factor(company_size) + 
##     as.factor(company_type) + as.factor(jobs) + train_hours, 
##     family = binomial(link = logit), data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1736  -0.6840  -0.4640  -0.2109   2.7797  
## 
## Coefficients:
##                                               Estimate Std. Error z value
## (Intercept)                                  3.8608333  0.2068226  18.667
## city_dev                                    -6.0094390  0.1588004 -37.843
## as.factor(gender)Other                       0.1760703  0.0745309   2.362
## as.factor(expenience)No relevent experience  0.2598113  0.0522554   4.972
## as.factor(university)no_enrollment          -0.2245314  0.0468793  -4.790
## as.factor(edu_level)High School             -0.9418252  0.0750823 -12.544
## as.factor(edu_level)Masters                 -0.1794383  0.0492573  -3.643
## as.factor(edu_level)Phd                     -0.5081710  0.1672069  -3.039
## as.factor(edu_level)Primary School          -1.4030985  0.1981243  -7.082
## as.factor(major)STEM                        -0.0778109  0.0686363  -1.134
## as.factor(expenience_year_new)2             -0.1739321  0.1103607  -1.576
## as.factor(expenience_year_new)3             -0.3552849  0.1099728  -3.231
## as.factor(company_size)10/49                 0.3175236  0.1088392   2.917
## as.factor(company_size)100-500              -0.0328439  0.1031412  -0.318
## as.factor(company_size)1000-4999             0.0761153  0.1197424   0.636
## as.factor(company_size)10000+                0.3110905  0.1065759   2.919
## as.factor(company_size)50-99                 0.0387523  0.0991545   0.391
## as.factor(company_size)500-999               0.0690498  0.1308882   0.528
## as.factor(company_size)5000-9999             0.2637057  0.1477820   1.784
## as.factor(company_size)Other                 1.4180197  0.0938440  15.110
## as.factor(company_type)Pvt Ltd              -0.0637367  0.0597014  -1.068
## as.factor(jobs)1                             0.0575809  0.0624454   0.922
## as.factor(jobs)2                             0.1387070  0.0735944   1.885
## as.factor(jobs)3                             0.1566176  0.1005800   1.557
## as.factor(jobs)4                             0.2277876  0.1008498   2.259
## as.factor(jobs)never                        -0.4914202  0.0845609  -5.811
## train_hours                                 -0.0008962  0.0003351  -2.675
##                                             Pr(>|z|)    
## (Intercept)                                  < 2e-16 ***
## city_dev                                     < 2e-16 ***
## as.factor(gender)Other                       0.01816 *  
## as.factor(expenience)No relevent experience 6.63e-07 ***
## as.factor(university)no_enrollment          1.67e-06 ***
## as.factor(edu_level)High School              < 2e-16 ***
## as.factor(edu_level)Masters                  0.00027 ***
## as.factor(edu_level)Phd                      0.00237 ** 
## as.factor(edu_level)Primary School          1.42e-12 ***
## as.factor(major)STEM                         0.25693    
## as.factor(expenience_year_new)2              0.11502    
## as.factor(expenience_year_new)3              0.00124 ** 
## as.factor(company_size)10/49                 0.00353 ** 
## as.factor(company_size)100-500               0.75015    
## as.factor(company_size)1000-4999             0.52500    
## as.factor(company_size)10000+                0.00351 ** 
## as.factor(company_size)50-99                 0.69593    
## as.factor(company_size)500-999               0.59781    
## as.factor(company_size)5000-9999             0.07435 .  
## as.factor(company_size)Other                 < 2e-16 ***
## as.factor(company_type)Pvt Ltd               0.28570    
## as.factor(jobs)1                             0.35648    
## as.factor(jobs)2                             0.05946 .  
## as.factor(jobs)3                             0.11944    
## as.factor(jobs)4                             0.02390 *  
## as.factor(jobs)never                        6.19e-09 ***
## train_hours                                  0.00748 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 19259  on 17157  degrees of freedom
## Residual deviance: 16043  on 17131  degrees of freedom
## AIC: 16097
## 
## Number of Fisher Scoring iterations: 4
# Analysis of deviance comparing the null (intercept-only) model
# against the full model.
anova(model.empty, model.full)
## Analysis of Deviance Table
## 
## Model 1: target ~ 1
## Model 2: target ~ city_dev + as.factor(gender) + as.factor(expenience) + 
##     as.factor(university) + as.factor(edu_level) + as.factor(major) + 
##     as.factor(expenience_year_new) + as.factor(company_size) + 
##     as.factor(company_type) + as.factor(jobs) + train_hours
##   Resid. Df Resid. Dev Df Deviance
## 1     17157      19258            
## 2     17131      16043 26   3215.7
# Likelihood-ratio statistic: null deviance minus residual deviance.
# (Fixed: the original subtracted 16003, a typo for the residual
# deviance 16043 reported above, giving 3255 instead of 3215.7.)
19258 - 16043
## [1] 3215
# Chi-square test on 26 df; p-value is essentially zero, so at least
# one predictor is associated with the target.
1 - pchisq(3215.7, df = 26)
## [1] 0
library(car) # load the car package (for Anova with Type III tests)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
# Type III SS (often shown as "Adjusted SS" in software) measures each
# variable's extra, independent contribution once all p variables are in
# the model. In general the individual SS sum to less than SSR; they are
# equal only when the variables are completely uncorrelated. A Type III
# r^2 can be derived accordingly.
Anova(model.full, type="III") # Type III analysis of deviance for the full model
## Analysis of Deviance Table (Type III tests)
## 
## Response: target
##                                LR Chisq Df Pr(>Chisq)    
## city_dev                        1517.56  1  < 2.2e-16 ***
## as.factor(gender)                  5.48  1   0.019182 *  
## as.factor(expenience)             24.51  1  7.401e-07 ***
## as.factor(university)             22.75  1  1.843e-06 ***
## as.factor(edu_level)             219.92  4  < 2.2e-16 ***
## as.factor(major)                   1.28  1   0.258672    
## as.factor(expenience_year_new)    19.20  2  6.786e-05 ***
## as.factor(company_size)          788.08  8  < 2.2e-16 ***
## as.factor(company_type)            1.13  1   0.286919    
## as.factor(jobs)                   85.55  5  < 2.2e-16 ***
## train_hours                        7.25  1   0.007093 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Model selection
# AIC-based stepwise selection (default k = 2 penalty).
# Fixed: use FALSE, not the reassignable shorthand F.
model.aic <- step(model.full, trace = FALSE)
summary(model.aic)
## 
## Call:
## glm(formula = target ~ city_dev + as.factor(gender) + as.factor(expenience) + 
##     as.factor(university) + as.factor(edu_level) + as.factor(expenience_year_new) + 
##     as.factor(company_size) + as.factor(jobs) + train_hours, 
##     family = binomial(link = logit), data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1762  -0.6848  -0.4635  -0.2121   2.7759  
## 
## Coefficients:
##                                               Estimate Std. Error z value
## (Intercept)                                  3.7314437  0.1885195  19.793
## city_dev                                    -5.9798731  0.1574074 -37.990
## as.factor(gender)Other                       0.1844614  0.0743442   2.481
## as.factor(expenience)No relevent experience  0.2668466  0.0520867   5.123
## as.factor(university)no_enrollment          -0.2211558  0.0466287  -4.743
## as.factor(edu_level)High School             -0.9516658  0.0746569 -12.747
## as.factor(edu_level)Masters                 -0.1772305  0.0492190  -3.601
## as.factor(edu_level)Phd                     -0.4956321  0.1666211  -2.975
## as.factor(edu_level)Primary School          -1.4147567  0.1978204  -7.152
## as.factor(expenience_year_new)2             -0.1769660  0.1103637  -1.603
## as.factor(expenience_year_new)3             -0.3618494  0.1098322  -3.295
## as.factor(company_size)10/49                 0.3104811  0.1086561   2.857
## as.factor(company_size)100-500              -0.0466643  0.1024966  -0.455
## as.factor(company_size)1000-4999             0.0581195  0.1187263   0.490
## as.factor(company_size)10000+                0.2869538  0.1047285   2.740
## as.factor(company_size)50-99                 0.0244237  0.0984360   0.248
## as.factor(company_size)500-999               0.0538645  0.1302738   0.413
## as.factor(company_size)5000-9999             0.2446250  0.1470373   1.664
## as.factor(company_size)Other                 1.3925382  0.0905437  15.380
## as.factor(jobs)1                             0.0586842  0.0622992   0.942
## as.factor(jobs)2                             0.1405328  0.0735056   1.912
## as.factor(jobs)3                             0.1573387  0.1005466   1.565
## as.factor(jobs)4                             0.2280631  0.1007859   2.263
## as.factor(jobs)never                        -0.4960657  0.0844113  -5.877
## train_hours                                 -0.0008947  0.0003350  -2.671
##                                             Pr(>|z|)    
## (Intercept)                                  < 2e-16 ***
## city_dev                                     < 2e-16 ***
## as.factor(gender)Other                      0.013095 *  
## as.factor(expenience)No relevent experience 3.01e-07 ***
## as.factor(university)no_enrollment          2.11e-06 ***
## as.factor(edu_level)High School              < 2e-16 ***
## as.factor(edu_level)Masters                 0.000317 ***
## as.factor(edu_level)Phd                     0.002934 ** 
## as.factor(edu_level)Primary School          8.57e-13 ***
## as.factor(expenience_year_new)2             0.108829    
## as.factor(expenience_year_new)3             0.000986 ***
## as.factor(company_size)10/49                0.004270 ** 
## as.factor(company_size)100-500              0.648910    
## as.factor(company_size)1000-4999            0.624470    
## as.factor(company_size)10000+               0.006144 ** 
## as.factor(company_size)50-99                0.804043    
## as.factor(company_size)500-999              0.679261    
## as.factor(company_size)5000-9999            0.096174 .  
## as.factor(company_size)Other                 < 2e-16 ***
## as.factor(jobs)1                            0.346206    
## as.factor(jobs)2                            0.055894 .  
## as.factor(jobs)3                            0.117622    
## as.factor(jobs)4                            0.023645 *  
## as.factor(jobs)never                        4.18e-09 ***
## train_hours                                 0.007574 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 19259  on 17157  degrees of freedom
## Residual deviance: 16045  on 17133  degrees of freedom
## AIC: 16095
## 
## Number of Fisher Scoring iterations: 4
# BIC-based stepwise selection: penalty k = log(n) instead of AIC's 2.
ss <- nrow(data)  # sample size (clearer than length(data[, 1]))
model.bic <- step(model.full, trace = FALSE, k = log(ss))
summary(model.bic)
## 
## Call:
## glm(formula = target ~ city_dev + as.factor(expenience) + as.factor(university) + 
##     as.factor(edu_level) + as.factor(expenience_year_new) + as.factor(company_size) + 
##     as.factor(jobs), family = binomial(link = logit), data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2130  -0.6827  -0.4640  -0.2134   2.7889  
## 
## Coefficients:
##                                             Estimate Std. Error z value
## (Intercept)                                  3.67369    0.18718  19.626
## city_dev                                    -5.95251    0.15700 -37.915
## as.factor(expenience)No relevent experience  0.27010    0.05205   5.189
## as.factor(university)no_enrollment          -0.21964    0.04660  -4.713
## as.factor(edu_level)High School             -0.95901    0.07461 -12.853
## as.factor(edu_level)Masters                 -0.17200    0.04916  -3.499
## as.factor(edu_level)Phd                     -0.48637    0.16625  -2.925
## as.factor(edu_level)Primary School          -1.42518    0.19793  -7.200
## as.factor(expenience_year_new)2             -0.18549    0.11036  -1.681
## as.factor(expenience_year_new)3             -0.37586    0.10977  -3.424
## as.factor(company_size)10/49                 0.31309    0.10860   2.883
## as.factor(company_size)100-500              -0.04496    0.10250  -0.439
## as.factor(company_size)1000-4999             0.06477    0.11867   0.546
## as.factor(company_size)10000+                0.29080    0.10471   2.777
## as.factor(company_size)50-99                 0.02545    0.09842   0.259
## as.factor(company_size)500-999               0.05694    0.13024   0.437
## as.factor(company_size)5000-9999             0.24754    0.14705   1.683
## as.factor(company_size)Other                 1.39312    0.09052  15.390
## as.factor(jobs)1                             0.05821    0.06225   0.935
## as.factor(jobs)2                             0.13866    0.07346   1.888
## as.factor(jobs)3                             0.15650    0.10047   1.558
## as.factor(jobs)4                             0.22723    0.10071   2.256
## as.factor(jobs)never                        -0.49484    0.08435  -5.866
##                                             Pr(>|z|)    
## (Intercept)                                  < 2e-16 ***
## city_dev                                     < 2e-16 ***
## as.factor(expenience)No relevent experience 2.11e-07 ***
## as.factor(university)no_enrollment          2.44e-06 ***
## as.factor(edu_level)High School              < 2e-16 ***
## as.factor(edu_level)Masters                 0.000468 ***
## as.factor(edu_level)Phd                     0.003439 ** 
## as.factor(edu_level)Primary School          6.00e-13 ***
## as.factor(expenience_year_new)2             0.092793 .  
## as.factor(expenience_year_new)3             0.000617 ***
## as.factor(company_size)10/49                0.003941 ** 
## as.factor(company_size)100-500              0.660949    
## as.factor(company_size)1000-4999            0.585246    
## as.factor(company_size)10000+               0.005481 ** 
## as.factor(company_size)50-99                0.795930    
## as.factor(company_size)500-999              0.661951    
## as.factor(company_size)5000-9999            0.092294 .  
## as.factor(company_size)Other                 < 2e-16 ***
## as.factor(jobs)1                            0.349741    
## as.factor(jobs)2                            0.059081 .  
## as.factor(jobs)3                            0.119298    
## as.factor(jobs)4                            0.024048 *  
## as.factor(jobs)never                        4.46e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 19259  on 17157  degrees of freedom
## Residual deviance: 16059  on 17135  degrees of freedom
## AIC: 16105
## 
## Number of Fisher Scoring iterations: 4
# Load pROC for ROC curves and AUC
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
# In-sample predictions on the link (logit) scale.
# Fixed: predict.glm's argument is `newdata`, not `data` — the original
# `data=data` was silently swallowed by `...` and ignored; it only gave
# the right answer because the default is the in-sample fit anyway.
pred.full <- predict(model.full, newdata = data)
pred.aic <- predict(model.aic, newdata = data)
pred.bic <- predict(model.bic, newdata = data)

# ROC objects for each model (link-scale scores are fine: ROC is
# invariant under monotone transforms).
roc.full <- roc(data$target, pred.full)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc.aic <- roc(data$target, pred.aic)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc.bic <- roc(data$target, pred.bic)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Compare the 3 models; the full model is currently best:
# 0.7812451 0.7811417 0.7805449
print(c(roc.full$auc, roc.aic$auc, roc.bic$auc))
## [1] 0.7812451 0.7811417 0.7805449
par(mfrow = c(1, 3))
plot(roc.full, main = "Full model", col = 1)
plot(roc.aic, main = "AIC", col = 2)
plot(roc.bic, main = "BIC", col = 3)

模型预测

# Model prediction: first check the AIC model on the hold-out data.
par(mfrow = c(1, 1))
data_train$company_size <- as.factor(data_train$company_size)
# Predicted attrition probabilities. type = "response" applies the
# inverse logit for us and is numerically stabler than the manual
# exp(p)/(1+exp(p)), which returns NaN for large link values.
p <- predict(model.aic, data_train, type = "response")
# Classify using the training-set attrition rate 0.249 as the threshold.
data_train$pred <- 1 * (p > 0.249)
# 2x2 frequency table of true vs predicted labels (confusion matrix).
table(data_train[, c("target", "pred")])
##       pred
## target    0    1
##      0 1095  400
##      1  134  371
ngrids <- 500                      # number of threshold grid points
TPR <- rep(0, ngrids)              # true positive rate per threshold
FPR <- rep(0, ngrids)              # false positive rate per threshold
ST.true <- data_train$target       # true labels (loop-invariant, hoisted)
for (i in seq_len(ngrids)) {
  p0 <- i / ngrids                 # candidate threshold
  ST.pred <- 1 * (p > p0)          # predicted labels at this threshold
  TPR[i] <- sum(ST.pred * ST.true) / sum(ST.true)
  FPR[i] <- sum(ST.pred * (1 - ST.true)) / sum(1 - ST.true)
}
# ROC curve traced manually over the threshold grid, plus the diagonal.
plot(FPR, TPR, type = "l", col = 2)
points(c(0, 1), c(0, 1), type = "l", lty = 2)

# Full model vs AIC vs BIC on the hold-out data.
p <- matrix(0, nrow(data_train), 3)      # one column of predictions per model
p[, 1] <- predict(model.full, data_train) # full model, link scale
p[, 2] <- predict(model.aic, data_train)  # AIC model, link scale
p[, 3] <- predict(model.bic, data_train)  # BIC model, link scale
# Inverse logit to probabilities. plogis() is the numerically stable
# equivalent of exp(x)/(1+exp(x)) (no overflow for large x).
p <- plogis(p)

# Base frame with the chance diagonal; model curves are added on top.
plot(c(0, 1), c(0, 1), type = "l", main = "FPR vs. TPR", xlab = "FPR", ylab = "TPR")

ST.true <- data_train$target             # true labels (made explicit here
                                         # instead of leaking from the
                                         # previous chunk's loop)
FPR <- rep(0, ngrids)
TPR <- rep(0, ngrids)
for (k in 1:3) {
  prob <- p[, k]                         # model k's predicted probabilities
  for (i in seq_len(ngrids)) {
    p0 <- i / ngrids                     # candidate threshold
    ST.hat <- 1 * (prob > p0)            # predicted labels at this threshold
    FPR[i] <- sum((1 - ST.true) * ST.hat) / sum(1 - ST.true)
    TPR[i] <- sum(ST.true * ST.hat) / sum(ST.true)
  }
  points(FPR, TPR, type = "b", col = k, lty = k, pch = k)  # model k's ROC curve
}
legend(0.6, 0.3, c("LOGIT FULL MODEL", "LOGIT AIC MODEL", "LOGIT BIC MODEL"),
       lty = c(1:3), col = c(1:3), pch = c(1:3)) # label the three curves